/* File: cln_data.do
 * Author: Luca Maini
 * Purpose: import the and create the data used in the merger project. The 
 *			output consists of 3 datasets:
 *				
 *				1. 	ProductLevelSalesSSR: sales at the product-year level
 *				2. 	ProductCompanyLevelNetSalesSSR: sales at the product-year-
 *					company level
 *				3.	appendedAcquisitionData: data on all acquisitions, combining
 *					SSR and SDC
 *			
 *			Other datasets are also created, but only as inputs to those main 3.
 *			The first two datasets are further re-elaborated in subsequent 
 *			scripts. The third one is analyzed manually to find matches, and the
 *			the resulting output is ultimately saved in an excel spreadsheet 
 *			called "SDC_SSR_matchlist", which is the main file used to build the 
 *			acquisition data.
 *
 * Date Created: 07/30/2020
 *
 */

/*	This script imports data from a variety of sources:
	
		1. 	Product level data from SSR Health on drug sales
		2. 	Product-Company level data from SSR Health on drug net sales
		3. 	Data from SSR Health on acquisitions in the pharmaceutical market
		4.	Data from SDC Platinum on acquisitions in the pharmaceutical market
*/	



/////////////////////////////////////////////////////////////
/////													/////
/////	0. Import and clean the raw SSR Health files	/////
/////													/////
/////////////////////////////////////////////////////////////


	
/* 	The new SSR data does not provide direct information on invoice sales or 
	units, so we have to run a few additional operations in order to deduce that
	information.
	
	Our data now has net_sales at the product-strength level, as well as WAC and
	net_price at the product-strength level. We use this data to deduce units 
	and invoice sales which we then aggregate at the product level.
	
	In doing this, we want to check (using our previous version of the data) 
	that we are doing the aggregation correctly.
*/

/*	0. Import the old version of the data	*/

* NOTE: check that the range is correct by looking up the spreadsheet
import excel "${inputdir}\US brand Rx net price tool _3Q19_dataonly.xlsx", ///
	sheet("Product data") cellrange(B7:RI1313) firstrow clear

drop if Product == ""	// empty lines

foreach var of varlist _all {
	capture confirm string var `var'
	if _rc != 0 {
		qui sum `var'
		if r(max) == . | (r(max) == 0 & r(min) == 0) {
			disp "`var'"
			drop `var'
			}
		}
	}

* Capitalize string variables
replace Product = trim(upper(Product))
replace Molecule = trim(upper(Molecule))
replace Company = trim(upper(Company))
replace Class = trim(upper(Class))

gen biologic = Size == "Large"
drop Size

rename Marketstart mkt_start
rename Marketend mkt_end
rename Exclude exclude
	
* Reshape to get a panel dataset
reshape long net_sales WAC_sales units WAC_price net_price tot_discount ///
			 medicaid_discount non_medicaid_discount, ///
	i(Company Product Molecule biologic Class ///
	  mkt_start mkt_end LOE exclude) j(qrt) string

* create numeric quarter variable
gen quarter = quarterly(qrt,"Q20Y")
format quarter %tq
drop qrt

* replace sales and units with missing values whenever it is set to 0
replace net_sales = . if net_sales == 0
replace WAC_sales = . if WAC_sales == 0
replace units = . if units == 0

* Drop vaccines
bysort Product : egen vaccine = max(regexm(Molecule, "VACC"))
drop if vaccine == 1
drop vaccine


* Drop variables we will recalculate
// drop WAC_price

compress
save "${outdir}\main_SSR_3Q19.dta", replace



/*	1. Open and merge the two main SSR Health Databases	*/

import excel "${inputdir}\product_strength_price_2020Q2.xlsx", ///
	firstrow clear

* minor variable edits
destring Unitsdose, replace
destring Dosesday, replace
destring DaysyrorTxcourse, replace 

gen quarter = quarterly(QQYYCombined,"YQ")
format quarter %tq
drop QQYYCombined

replace Product = upper(Product)
replace Strength = upper(Strength)

replace MeasureNames = "net_price" if MeasureNames == "Net/Unit"
replace MeasureNames = "WAC_price" if MeasureNames == "WAC/Unit"

* reshape to get a Product-Strength-quarter panel
reshape wide MeasureValues, i(Product Strength quarter) j(MeasureNames) string
rename MeasureValuesnet_price net_price
rename MeasureValuesWAC_price WAC_price

tempfile prices
save "`prices'", replace

* Now import the net_sales data
import excel "${inputdir}\SSR_Net_sales_est_detail_2Q20_dataonly.xlsx", ///
	firstrow clear

* minor variable edits
rename Strengthform Strength
rename Class Class_sales

replace Product = upper(Product)
replace Strength = upper(Strength)

* reshape to get a panel
reshape long net_sales, i(Product Strength) j(q) string

replace net_sales = trim(net_sales)
replace net_sales = "" if net_sales == "-"
destring net_sales, replace
drop if net_sales == .

gen quarter = quarterly(q, "Q20Y")
format quarter %tq
drop q

* drop variables we don't need
drop Ticker

merge 1:1 Product Strength quarter using "`prices'"

gen units = net_sales/net_price
gen WAC_sales = WAC_price * units


keep Product Strength quarter net_sales WAC_sales units
order Product Strength quarter
sort Product quarter Strength

foreach var in net_sales WAC_sales units {
	rename `var' `var'_post2Q20
	}

* aggregate at the product level
collapse (sum) net_sales_post2Q20 units_post2Q20 WAC_sales_post2Q20, ///
	by(Product quarter)

merge 1:1 Product quarter using "${outdir}\main_SSR_3Q19.dta"

keep Product quarter net_sales WAC_sales units ///
	net_sales_post2Q20 units_post2Q20 WAC_sales_post2Q20 ///
	mkt_start mkt_end LOE _merge


* create a unique variable for net sales

* there are a lot of differences between the new and old net sales data, but the 
* only one that seems like a clear error is Symilin in 2014Q1, which has a value
* that seems totally out of whack. We use the old data value for that 
* observation only.

* The only other instance where we use old data is for drugs that have very low
* sales, and where we worry that rounding might lead to issues.
* We use the following criteria:
*	
*	1.	Replace all numbers lower than 0.001 with the unrounded number from the 
*		old data
*	2.	For numbers <= than 0.01, replace if log difference <= than 0.7
*	3.	For numbers <= than 0.1, replace if log difference <= than 0.4
*	4. 	For numbers <= than 1 replace only if average log difference <= than 0.1
*		and log difference <= than 0.1

gen net_salesX = net_sales_post2Q20
replace net_salesX = net_sales if Product == "SYMLIN" & quarter == tq(2014q1)

gen temp = net_sales_post2Q20 if net_sales != .
bysort Product : egen avg_sales_post2Q20 = mean(temp)
drop temp
bysort Product (quarter) : egen avg_sales_round = mean(round(net_sales*1000)/1000)

gen avg_diff = abs(log(avg_sales_post2Q20) - log(avg_sales_round))
gen diff = abs(log(net_sales_post2Q20) - log(net_sales))


replace net_salesX = net_sales if net_sales_post2Q20 <= 0.001 & net_sales != .
replace net_salesX = net_sales if net_sales_post2Q20 <= 0.01 & net_sales != . & diff < 0.7
replace net_salesX = net_sales if net_sales_post2Q20 <= 0.1 & net_sales != . & diff < 0.4
replace net_salesX = net_sales if net_sales_post2Q20 <= 1 & net_sales != . & diff < 0.1 & avg_diff < 0.1


* create a unique variable for WAC_sales & units
* These variables are likely much more reliable post-2Q20, so we only operate
* one change, to address what looks like a very out of line value. 
gen WAC_salesX = WAC_sales_post2Q20
replace net_salesX = net_sales if Product == "VIMOVO" & quarter == tq(2010q3)
gen unitsX = units_post2Q20
replace unitsX = units if Product == "VIMOVO" & quarter == tq(2010q3)


* now we fill in the missing data for WAC sales. In the new dataset, we only get
* WAC_sales info if the drug also has net_sales info. We fill in the gaps using
* regressions.
gen imputation_flag = _merge == 2 & (WAC_sales != . | net_sales != .)
bysort Product : egen has_missing_post2Q20 = max(imputation_flag)

* run simple linear regression without a constant term to fill in missing values
* (this is basically an adjustment by the average relative difference between 
* the two series). 
gen WAC_sales_hat = .

levelsof Product if has_missing_post2Q20 == 1, local(prodImputList)

foreach prod of local prodImputList {
	disp "`prod'"

	qui capture reg WAC_salesX WAC_sales if Product == "`prod'", noconst
	if _rc != 0 continue	
	qui predict temp if Product == "`prod'"
	qui replace WAC_sales_hat = temp if Product == "`prod'"
	qui drop temp
	}

replace WAC_salesX = WAC_sales_hat if WAC_salesX == . & WAC_sales_hat != .


* repeat for units
gen units_hat = .

foreach prod of local prodImputList {
	disp "`prod'"

	qui capture reg unitsX units if Product == "`prod'", noconst
	if _rc != 0 continue	
	qui predict temp if Product == "`prod'"
	qui replace units_hat = temp if Product == "`prod'"
	qui drop temp
	}

replace unitsX = units_hat if unitsX == . & units_hat != .
drop units_hat


* CHECKS (all these should be 0)
count if unitsX != . & WAC_salesX == .
count if unitsX == . & WAC_salesX != .

replace imputation_flag = 0 if WAC_salesX == .

* save
keep Product quarter net_sales* WAC_sales* units* imputation_flag
sort Product quarter

save "${outdir}/SSR_sales_master_2Q20.dta", replace


* now import discount rates
import delimited "${inputdir}\discount_rate_estimates_spreadsheet.csv", clear
rename ïproduct Product
replace Product = trim(upper(Product))
label var Product ""

drop class companies
duplicates drop

gen quarter = quarterly(qqyycombined, "YQ")
format quarter %tq
drop qqyycombined

replace measurenames = trim(measurenames)
replace measurenames = "non_med_discount" if measurenames == "Non-Medicaid WAC-net discount %"
replace measurenames = "med_discount" if measurenames == "Medicaid WAC-net discount %"
replace measurenames = "tot_discount" if measurenames == "Total WAC-net discount %"

reshape wide measurevalues, i(Product quarter) j(measurenames) string

foreach var in med_discount non_med_discount tot_discount {
	rename measurevalues`var' `var'
	}

* save as separate dataset
save "${outdir}/discount_rates_2Q20.dta", replace
	


* now merge with the old and new data to add product characteristics

* for old product characterstics, use old data (more complete)
use "${outdir}\main_SSR_3Q19.dta", clear
keep Product Company Molecule biologic Class mkt_start mkt_end LOE
duplicates drop

merge 1:m Product using "${outdir}/SSR_sales_master_2Q20.dta", nogen ///
	keepusing(Product quarter net_salesX WAC_salesX unitsX imputation_flag)

sort Product quarter
order Product quarter

tempfile temp
save "`temp'", replace

* keep only list of unmatched products
keep if Company == ""
keep Product
duplicates drop

tempfile product_list
save "`product_list'", replace
 
* for new products, use the net_sales spreadsheet
import excel "${inputdir}\SSR_Net_sales_est_detail_2Q20_dataonly.xlsx", ///
	firstrow clear

drop net_sales* Ticker
replace Product = upper(Product)

merge m:1 Product using "`product_list'", keep(using match) nogen

replace Company = upper(Company)
replace Molecule = upper(Molecule)
replace Class = upper(Class)

gen mkt_start = date(Marketstart, "MDY")
gen mkt_end = date(Marketend, "MDY")
format mkt_start mkt_end %td
drop Marketstart Marketend

gen biologic = Size == "Large"
drop Size

* Harmonize at the product level
drop Strength
duplicates drop

* collapse to the earliest market start date and latest market end
collapse (min) mkt_start (max) mkt_end, by(Product Molecule Class Company LOE biologic)

* at this point we have several vaccines that have many molecule names plus 
* KISQUALI which is recorded with two molecule names
replace Molecule = "RIBOCICLIB SUCCINATE" if Product == "KISQALI"

* for vaccines, we just pick one at random
bysort Product (Molecule) : replace Molecule = Molecule[1]

* Collapse one last time
collapse (min) mkt_start (max) mkt_end, by(Product Molecule Class Company LOE biologic)

* now append to the SSR_3Q19 data to do a single merge
append using "${outdir}\main_SSR_3Q19.dta"

keep Company Product Molecule Class LOE biologic mkt_start mkt_end
duplicates drop

merge 1:m Product using "${outdir}/SSR_sales_master_2Q20.dta", nogen ///
	keepusing(Product quarter net_salesX WAC_salesX unitsX imputation_flag)

* now finally merge with discount rates
merge 1:1 Product quarter using "${outdir}/discount_rates_2Q20.dta", nogen

* sanity edits to market start and end dates
replace mkt_start = . if mkt_start == td(31dec1899)
replace mkt_end = . if mkt_end == td(31dec9999) | mkt_end == td(31dec1899)


sort Product quarter
order Product quarter

rename WAC_salesX WAC_sales
rename net_salesX net_sales
rename unitsX units

* MANUAL EDITS TO LOE AND MARKET END DATES
replace LOE = td(01feb2005) if Product == "CELEXA" // https://investor.mylan.com/news-releases/news-release-details/mylan-receives-fda-approval-citalopram-hydrobromide-tablets
replace mkt_end = tq(2010q2) if Product == "MOBAN"	// https://psychnews.psychiatryonline.org/doi/full/10.1176/pn.45.5.psychnews_45_5_024

* save final version
save "${outdir}\main_SSR_2Q20.dta", replace




/////////////////////////////////////////////////////////////////
/////														/////
/////	1. Product level data from SSR Health on drug sales	/////
/////														/////
/////////////////////////////////////////////////////////////////

* this dataset has already been mostly cleaned
use "${outdir}\main_SSR_2Q20.dta", clear

* drop variables we will never use
drop Molecule Class biologic

* drop variables we are not going to use
drop tot_discount med_discount non_med_discount

* replace 0s with missing values
replace WAC_sales = . if WAC_sales == 0
replace net_sales = . if net_sales == 0
replace units = . if units == 0

* Light imputation step: Drop observations missing WAC at the beginning and end, 
* and fill in any eventual gaps
bysort Product : egen temp = min(quarter) if WAC_sales != . & units != .
bysort Product : egen first_qrt = min(temp)
drop temp
bysort Product : egen temp = max(quarter) if WAC_sales != . & units != .
bysort Product : egen last_qrt = min(temp)
drop temp

* drop products for which there is no data
drop if quarter < first_qrt | quarter > last_qrt
drop first_qrt last_qrt

* Fill in any eventual missing data
egen prodID = group(Product)
xtset prodID quarter
tsfill
bysort prodID (Product) : replace Product = Product[_N]
bysort prodID (LOE) : replace LOE = LOE[_N]

foreach var in WAC_sales units net_sales {
	bysort Product (quarter) : ipolate `var' quarter, gen(`var'_ipolate)
	sum `var'_ipolate if `var' == .
	replace `var' = `var'_ipolate if `var' == .
	drop `var'_ipolate
	}

* Extend the series by creating observations for the entire series (even if 
* products have not been launched yet). This minimizes matching issues down the
* line (e.g. if a product is acquired prior to sales being registered in SSR).
tsfill, full

bysort prodID (Product) : replace Product = Product[_N]
sort Product quarter
drop prodID

* update variables
bysort Product (Company) : replace Company = Company[_N]

foreach var in LOE mkt_start mkt_end {
	bysort Product : egen temp = max(`var')
	drop `var'
	rename temp `var'
	format `var' %td
	}

* Exclude post-exclusivity and tails without observations
drop if quarter >= qofd(LOE)	// exclude products post-exclusivity

bysort Product : egen max_quarter = max(quarter * (WAC < .))
drop if quarter > max_quarter

drop max_quarter

* Exclude pre-mkt_start quarters
bysort Product (quarter) : egen temp = min(quarter) if WAC_sales < .
bysort Product (quarter) : egen first_qrt = max(temp)
drop temp

replace mkt_start = dofq(first_qrt) if qofd(mkt_start) > first_qrt

drop if quarter < qofd(mkt_start)

* save
save "${outdir}\ProductLevelSalesSSR.dta", replace



/////////////////////////////////////////////////////////////////////////////
/////																	/////
/////	2. Product-company level data from SSR Health on drug net sales	/////
/////																	/////
/////////////////////////////////////////////////////////////////////////////

* import the raw data from excel
import excel ///
	"${inputdir}\US brand Rx net price tool _3Q19_ProductLevel_netSales.xlsx", ///
		sheet("Sheet1") firstrow clear

* keep only relevant variables: identifiers and net sales
keep Company Product net_sales* LOE

* reshape (each quarter currently has a separate variable)
reshape long net_sales, i(Company Product) j(quarterString) string

* Convert quarter to numeric
gen quarter = quarterly(quarterString,"Q20Y")
drop quarterString
format quarter %tq

* generate variable list each company's share of a product's net sales
bysort Product quarter : egen tot_sales = total(net_sales)
gen pct_sales = net_sales/tot_sales
drop tot_sales

* save
drop if quarter >= qofd(LOE)	// exclude products post-exclusivity
save "${outdir}\ProductCompanyLevelNetSalesSSR.dta", replace


/////////////////////////////////////////////////////////////
/////													/////
/////	3. Data from SSR Health on pharma acquisitions	/////
/////													/////
/////////////////////////////////////////////////////////////

import excel "${inputdir}\SSR_Merger_Info_3Q19.xlsx", ///
	sheet("Sheet1") firstrow clear

* Keep only drugs that experienced acquisition
drop if Acquisition1 == . & Acquisition2 == . & Divestiture == .

* harmonize company names
foreach var in Company Acquiredfrom1 Acquiredfrom2 Divestedto Product Molecule {
	replace `var' = trim(upper(`var'))
	}

* Imputation of missing/erroneous data

* Products with indication of partial rights
forvalues i = 1/2 {
	replace Acqinterest`i' = 0.5 if ///
		regexm(Acquiredfrom`i', "PARTIAL RIGHTS") & Acqinterest`i' == .

	replace Acqinterest`i' = 1 if ///
		regexm(Acquiredfrom`i', "FULL RIGHTS") & Acqinterest`i' == .
	
	}

* Imputation of data that is incorrect
replace Divestedto = Acquiredfrom1 if Product == "ELIGARD" // https://scrip.pharmaintelligence.informa.com/SC004854/QLT-sells-Eligard-franchise-to-Tolmar-in-$230-million-deal
replace Acquiredfrom1 = "" if Product == "ELIGARD"

replace Divestedto = Acquiredfrom1 if ///
	Product == "GLUMETZA" & Divestiture == td(30sep2013)	// http://investor.assertiotx.com/news-releases/news-release-details/depomed-sells-type-2-diabetes-royalties-and-milestones-pdl
replace Acquiredfrom1 = "" if ///
	Product == "GLUMETZA" & Divestiture == td(30sep2013)

replace Divestedto = Acquiredfrom1 if Product == "EGRIFTA"	// https://www.globenewswire.com/news-release/2014/05/01/1473876/0/en/Theratechnologies-Regains-Rights-to-EGRIFTA-R-tesamorelin-for-injection-in-the-United-States.html
replace Acquiredfrom1 = "" if Product == "EGRIFTA"

* Rename variable to create a structure more closely resembling SDC

/*	These substitutions are always made on the assumption that the Acquisition1 
	date comes BEFORE Divestiture (logically, a company must acquire a product 
	before divesting it). Therefore, there can be four possible sequences of 
	ownership:
	
	1. Company --> Divestedto
	2. Acquiredfrom1 --> Company
	3. Acquiredfrom2 --> Company
	4. Acquiredfrom1 --> Company --> Divestedto

*/


* single-acquisition drugs
gen AcquirorName1 = Company if Acquisition1 != . & ///
							   Divestiture == .

replace AcquirorName1 = Divestedto if Divestiture != . &	///
									  Acquisition1 == .

* Drugs with two acquisitions 
replace AcquirorName1 = Company if Acquisition1 != . & ///
								   Divestiture != .

* second acquisition
gen AcquirorName2 = Company if Acquisition1 != . & ///
							   Acquisition2 != . & ///
							   Divestiture == .

replace AcquirorName2 = Divestedto if Acquisition1 != . & ///
									  Acquisition2 == . & ///
									  Divestiture != .



* repeat the exercise for the target name
* single-acquisition drugs
gen TargetName1 = Acquiredfrom1 if Acquisition1 != . & ///
								   Divestiture == .

replace TargetName1 = Company if Divestiture != . &	///
								 Acquisition1 == .

* Drugs with two acquisitions 
replace TargetName1 = Acquiredfrom1 if Acquisition1 != . & ///
									   Divestiture != .

* second acquisition
gen TargetName2 = Acquiredfrom2 if Acquisition1 != . & ///
								   Acquisition2 != . & ///
								   Divestiture == .

replace TargetName2 = Company if Acquisition1 != . & ///
								 Acquisition2 == . & ///
								 Divestiture != .

* Interest variables
gen interest1 = Acqinterest1
replace interest1 = Interestatdivest if Acquisition1 == . & ///
										Divestiture != .

gen interest2 = Acqinterest2
replace interest2 = Interestatdivest if Acquisition1 == . & ///
										Divestiture != .


* Finally, harmonize dates using the same format
gen Date1 = Acquisition1 if Acquisition1 != .
replace Date1 = Divestiture if Acquisition1 == . & ///
							   Divestiture != .

gen Date2 = Acquisition2
replace Date2 = Divestiture if Acquisition1 != . & ///
							   Divestiture != .


* drop variables that now report duplicate information
drop Acquisition1 Acquisition2 Divestiture Acquiredfrom1 Acquiredfrom2 ///
	Divestedto Acqinterest1 Acqinterest2 Interestatdivest
* drop variables we don't care about carrying over
drop Subsector Ticker Size Class Marketstart Marketend Exclude

* reshape
reshape long AcquirorName TargetName interest Date, i(Company Product) j(n)
drop n Company
drop if AcquirorName == ""
format Date %td

* generate year and year-month of deal
gen year = year(Date)
gen ym = mofd(Date)

* save
save "${outdir}\AcquisitionDataSSR.dta", replace



/////////////////////////////////////////////////////
/////											/////
/////	4. Data from SDC on pharma acquisitions	/////
/////											/////
/////////////////////////////////////////////////////

* Import SDC data
import delimited "${inputdir}\pharma_ma_all_2000_2020_v2.csv", ///
	delimiter(comma) stringcols(_all) varnames(1) clear
	
drop if dateeffective == ""

* cosmetic edits
gen TargetName = upper(trim(targetname))
drop targetname
gen AcquirorName = upper(trim(acquirorname))
drop acquirorname

replace valueoftransactionmil = subinstr(valueoftransactionmil,",","",.)
destring valueoftransactionmil, gen(ValueofTransactionMil)
drop valueoftransactionmil
destring ofsharesacq, gen(SharesAcq)
drop ofsharesacq
destring ownedaftertransaction, gen(OwnedAfterTransaction)
drop ownedaftertransaction

* fix up dates
gen dateAnnounced = date(dateannounced,"MDY")
gen dateEffective = date(dateeffective,"MDY")
drop dateannounced dateeffective
format date* %td

* drop dates before 01Jan2007
drop if dateEffective <= td(31dec2006)

* generate year and year-month of deal
gen year = year(dateEffective)
gen ym = mofd(dateEffective)

* save
save "${outdir}\AcquisitionDataSDC.dta", replace

* As a final step we append the two acquisition datasets and format them for 
* manual comparison
append using "${outdir}\AcquisitionDataSSR.dta"
format ym %tm
gen meshDate = Date
replace meshDate = dateEffective if meshDate == .

order TargetName ym AcquirorName meshDate Date dateEffective
sort ym meshDate TargetName

save "${outdir}/appendedAcquisitionData.dta", replace
